In [1]:
# https://scanpy.readthedocs.io/en/stable/tutorials/spatial/basic-analysis.html
# https://squidpy.readthedocs.io/en/stable/
# used data : https://support.10xgenomics.com/spatial-gene-expression/datasets/1.0.0/V1_Human_Lymph_Node
In [2]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
np.random.seed(42)
In [3]:
sc.logging.print_versions()
sc.set_figure_params(facecolor="white", figsize=(8, 8))
sc.settings.verbosity = 3
----- anndata 0.11.1 scanpy 1.10.4 ----- PIL 11.0.0 anyio NA arrow 1.3.0 asciitree NA asttokens NA attr 24.2.0 attrs 24.2.0 babel 2.16.0 certifi 2024.08.30 charset_normalizer 3.3.2 cloudpickle 3.1.0 colorama 0.4.6 comm 0.2.2 cycler 0.12.1 cython_runtime NA dask 2024.12.0 dateutil 2.9.0.post0 debugpy 1.8.6 decorator 5.1.1 defusedxml 0.7.1 executing 2.1.0 fastjsonschema NA fqdn NA h5py 3.12.1 idna 3.10 igraph 0.11.8 ipykernel 6.29.5 ipywidgets 8.1.5 isoduration NA jedi 0.19.1 jinja2 3.1.4 joblib 1.4.2 json5 0.9.25 jsonpointer 3.0.0 jsonschema 4.23.0 jsonschema_specifications NA jupyter_events 0.10.0 jupyter_server 2.14.2 jupyterlab_server 2.27.3 kiwisolver 1.4.7 legacy_api_wrap NA leidenalg 0.10.2 llvmlite 0.43.0 markupsafe 2.1.5 matplotlib 3.9.3 mpl_toolkits NA msgpack 1.1.0 natsort 8.4.0 nbformat 5.10.4 numba 0.60.0 numcodecs 0.14.1 numpy 1.26.4 overrides NA packaging 24.1 pandas 2.2.3 parso 0.8.4 patsy 1.0.1 platformdirs 4.3.6 prometheus_client NA prompt_toolkit 3.0.48 psutil 6.0.0 pure_eval 0.2.3 pyarrow 18.1.0 pydev_ipython NA pydevconsole NA pydevd 3.1.0 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.18.0 pyparsing 3.2.0 pythoncom NA pythonjsonlogger NA pytz 2024.2 pywin32_bootstrap NA pywin32_system32 NA pywintypes NA referencing NA requests 2.32.3 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA scipy 1.14.1 seaborn 0.13.2 send2trash NA session_info 1.0.0 six 1.16.0 sklearn 1.5.2 sniffio 1.3.1 stack_data 0.6.3 statsmodels 0.14.4 tblib 3.0.0 texttable 1.7.0 threadpoolctl 3.5.0 tlz 1.0.0 toolz 1.0.0 tornado 6.4.1 traitlets 5.14.3 uri_template NA urllib3 1.26.20 wcwidth 0.2.13 webcolors 24.8.0 websocket 1.8.0 win32api NA win32com NA win32con NA win32trace NA winerror NA yaml 6.0.2 zarr 2.18.3 zmq 26.2.0 ----- IPython 8.28.0 jupyter_client 8.6.3 jupyter_core 5.7.2 jupyterlab 4.2.5 notebook 7.2.2 ----- Python 3.12.7 (tags/v3.12.7:0b05ead, Oct 1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)] Windows-11-10.0.22631-SP0 ----- Session information updated at 2024-12-05 14:17
In [ ]:
In [4]:
print("Reading the data:")
Reading the data:
In [5]:
adata = sc.datasets.visium_sge(sample_id="V1_Human_Lymph_Node")
adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
reading C:\Users\user\Desktop\Spatial_Transcriptomics\data\V1_Human_Lymph_Node\filtered_feature_bc_matrix.h5
C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
(0:00:00)
C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
In [6]:
str(adata)
Out[6]:
"AnnData object with n_obs × n_vars = 4035 × 36601\n obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt'\n var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'\n uns: 'spatial'\n obsm: 'spatial'"
In [ ]:
In [7]:
print('QC and preprocessing:')
QC and preprocessing:
In [8]:
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
adata.obs["total_counts"][adata.obs["total_counts"] < 10000],
kde=False,
bins=40,
ax=axs[1],
)
sns.histplot(adata.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
adata.obs["n_genes_by_counts"][adata.obs["n_genes_by_counts"] < 4000],
kde=False,
bins=60,
ax=axs[3],
)
Out[8]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
In [9]:
sc.pp.filter_cells(adata, min_counts=5000)
sc.pp.filter_cells(adata, max_counts=35000)
adata = adata[adata.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata.n_obs}")
sc.pp.filter_genes(adata, min_cells=10)
filtered out 44 cells that have less than 5000 counts filtered out 130 cells that have more than 35000 counts #cells after MT filter: 3861 filtered out 16916 genes that are detected in less than 10 cells
In [ ]:
In [10]:
print('Normalization :')
Normalization :
In [11]:
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000)
normalizing counts per cell
finished (0:00:00)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
In [ ]:
In [12]:
print("Manifold embedding and clustering based on transcriptional similarity :")
Manifold embedding and clustering based on transcriptional similarity :
In [13]:
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(
adata,
key_added="clusters",
directed=False,
n_iterations=2
)
computing PCA
with n_comps=50
finished (0:00:01)
computing neighbors
using 'X_pca' with n_pcs = 50
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:08)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:08)
C:\Users\user\AppData\Local\Temp\ipykernel_2264\3081113148.py:5: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg. To achieve the future defaults please pass: flavor="igraph" and n_iterations=2. directed must also be False to work with igraph's implementation. sc.tl.leiden(
running Leiden clustering
finished: found 10 clusters and added
'clusters', the cluster labels (adata.obs, categorical) (0:00:00)
In [14]:
# Visualize the UMAP plot
# sc.pl.umap(adata)
In [15]:
plt.rcParams["figure.figsize"] = (4, 4)
sc.pl.umap(adata, color=["total_counts", "n_genes_by_counts", "clusters"], wspace=0.4)
In [16]:
print("Visualization in spatial coordinates :")
Visualization in spatial coordinates :
In [17]:
# using sc.pl.spatial :
# img_key: key where the img is stored in the adata.uns element
# crop_coord: coordinates to use for cropping (left, right, top, bottom)
# alpha_img: alpha value for the transcparency of the image
# bw: flag to convert the image into gray scale
# size parameter : it becomes a scaling factor for the spot sizes
plt.rcParams["figure.figsize"] = (4, 4)
sc.pl.spatial(adata, img_key="hires", color=["total_counts", "n_genes_by_counts"])
In [18]:
plt.rcParams["figure.figsize"] = (4, 4)
sc.pl.spatial(adata, img_key="hires", color="clusters", size=1.5)
In [19]:
# By changing the alpha values of the spots, we can visualize better the underlying tissue morphology from the H&E image.
plt.rcParams["figure.figsize"] = (4, 4)
sc.pl.spatial(
adata,
img_key="hires",
color="clusters",
groups=["5", "9"],
crop_coord=[7000, 10000, 0, 6000],
alpha=0.5,
size=1.3,
)
In [20]:
print("Cluster marker genes :")
# Let us further inspect cluster 5, which occurs in small groups of spots across the image.
# Compute marker genes and plot a heatmap with expression levels of its top 10 marker genes across clusters.
plt.rcParams["figure.figsize"] = (4, 4)
sc.tl.rank_genes_groups(adata, "clusters", method="t-test")
sc.pl.rank_genes_groups_heatmap(adata, groups="9", n_genes=10, groupby="clusters")
Cluster marker genes :
ranking genes
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:02)
WARNING: dendrogram data not found (using key=dendrogram_clusters). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_clusters']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: 0, 1, 2, etc.
var_group_labels: 9
In [21]:
# We see that CR2 recapitulates the spatial structure.
sc.pl.spatial(adata, img_key="hires", color=["clusters", "CR2"])
sc.pl.spatial(adata, img_key="hires", color=["COL1A2", "SYPL1"], alpha=0.7)
In [ ]:
In [22]:
print("A Merfish Example :")
import os
# Get the current working directory
current_folder = os.getcwd()
print("Current folder path:", current_folder)
A Merfish Example : Current folder path: C:\Users\user\Desktop\Spatial_Transcriptomics
In [23]:
# coordinates = pd.read_excel("./merfish/pnas.1912459116.sd15.xlsx", index_col=0)
# counts = sc.read_csv("./merfish/pnas.1912459116.sd12.csv").transpose()
import pandas as pd
# Read the coordinates file (assuming it's a tab-delimited text file)
coordinates = pd.read_csv("./merfish/pnas.1912459116.sd12.txt", sep='\t', index_col=0)
# Print rows and columns separately
print("Number of rows:", coordinates.shape[0])
print("Number of columns:", coordinates.shape[1])
print(coordinates.head(3))
Number of rows: 8900
Number of columns: 1368
B1_cell1 B1_cell2 B1_cell3 B1_cell4 B1_cell5 B1_cell6 B1_cell7 \
A1CF 0 0 1 1 0 0 0
A2M 4 2 1 1 1 1 7
A2ML1 0 0 0 0 0 0 0
B1_cell8 B1_cell9 B1_cell10 ... B3_cell314 B3_cell315 B3_cell316 \
A1CF 0 1 0 ... 0.0 0.0 0.0
A2M 5 0 0 ... 1.0 0.0 1.0
A2ML1 0 0 0 ... 1.0 0.0 0.0
B3_cell317 B3_cell318 B3_cell319 B3_cell320 B3_cell321 B3_cell322 \
A1CF 0.0 0.0 0.0 0.0 0.0 0.0
A2M 0.0 0.0 3.0 0.0 0.0 0.0
A2ML1 0.0 0.0 0.0 1.0 0.0 0.0
B3_cell323
A1CF 0.0
A2M 0.0
A2ML1 0.0
[3 rows x 1368 columns]
In [24]:
counts1 = pd.read_csv("./merfish/pnas.1912459116.sd15.sheet1.txt", sep='\t', index_col=0)
counts1 = counts1.transpose()
# Print rows and columns separately
print("Number of rows:", counts1.shape[0])
print("Number of columns:", counts1.shape[1])
print(counts1.head(3))
Number of rows: 2
Number of columns: 645
B1_cell1 B1_cell2 B1_cell3 B1_cell4 B1_cell5 \
x_microns -891.313702 -609.526668 -416.254156 -283.819862 -83.990122
y_microns -1464.550024 -1457.751892 -1451.438190 -1470.768830 -1472.083057
B1_cell6 B1_cell7 B1_cell8 B1_cell9 B1_cell10 \
x_microns 524.513139 1074.899607 1305.153414 1562.123421 1678.586896
y_microns -1468.855426 -1455.000790 -1457.479606 -1471.720412 -1451.114542
... B1_cell636 B1_cell637 B1_cell638 B1_cell639 \
x_microns ... 150.892218 -940.323054 1496.255135 326.456359
y_microns ... 1668.983661 1682.229687 1654.024194 1669.248722
B1_cell640 B1_cell641 B1_cell642 B1_cell643 B1_cell644 \
x_microns 1618.485479 -1404.280333 -1147.346414 1164.401171 31.242861
y_microns 1686.075261 1670.873055 1675.450682 1671.844596 1676.214779
B1_cell645
x_microns 1215.824420
y_microns 1676.306881
[2 rows x 645 columns]
In [25]:
counts2 = pd.read_csv("./merfish/pnas.1912459116.sd15.sheet2.txt", sep='\t', index_col=0)
counts2 = counts2.transpose()
# Print rows and columns separately
print("Number of rows:", counts2.shape[0])
print("Number of columns:", counts2.shape[1])
print(counts2.head(3))
Number of rows: 2
Number of columns: 400
B2_cell1 B2_cell2 B2_cell3 B2_cell4 B2_cell5 \
x_microns -1405.162152 1.527491 278.222805 634.878363 818.295448
y_microns -1444.964412 -1434.637254 -1457.511428 -1436.565978 -1441.332826
B2_cell6 B2_cell7 B2_cell8 B2_cell9 B2_cell10 \
x_microns 1365.447312 927.737251 -1087.390772 -808.822185 409.932018
y_microns -1450.768981 -1425.053558 -1422.314565 -1447.640139 -1416.904108
... B2_cell391 B2_cell392 B2_cell393 B2_cell394 \
x_microns ... 1289.254159 -960.552944 -103.604880 81.621873
y_microns ... 1664.402284 1644.348537 1669.308119 1678.015756
B2_cell395 B2_cell396 B2_cell397 B2_cell398 B2_cell399 \
x_microns -296.254218 -522.446692 1528.198721 543.802237 295.543885
y_microns 1676.169322 1691.621243 1692.063498 1688.379317 1676.841504
B2_cell400
x_microns 663.169781
y_microns 1694.803696
[2 rows x 400 columns]
In [26]:
counts3 = pd.read_csv("./merfish/pnas.1912459116.sd15.sheet3.txt", sep='\t', index_col=0)
counts3 = counts3.transpose()
# Print rows and columns separately
print("Number of rows:", counts3.shape[0])
print("Number of columns:", counts3.shape[1])
print(counts3.head(3))
Number of rows: 2
Number of columns: 323
B3_cell1 B3_cell2 B3_cell3 B3_cell4 B3_cell5 \
x_microns -1375.706616 -341.671385 -221.114651 98.235196 445.156746
y_microns -1455.720346 -1442.128506 -1452.552220 -1426.726277 -1425.201268
B3_cell6 B3_cell7 B3_cell8 B3_cell9 B3_cell10 \
x_microns 578.381732 1352.548263 -694.582342 -437.345786 908.690996
y_microns -1449.066879 -1422.223095 -1426.159138 -1417.409844 -1413.549813
... B3_cell314 B3_cell315 B3_cell316 B3_cell317 \
x_microns ... 1158.617861 631.548297 -452.503252 -581.253019
y_microns ... 1644.509204 1588.120442 1644.849210 1661.557899
B3_cell318 B3_cell319 B3_cell320 B3_cell321 B3_cell322 \
x_microns 482.200855 -1157.252043 -190.243030 697.138726 -1322.985397
y_microns 1655.232806 1662.172290 1629.152291 1686.252964 1674.899565
B3_cell323
x_microns -340.775496
y_microns 1693.324859
[2 rows x 323 columns]
In [27]:
# Get the current working directory
current_folder = os.getcwd()
print("Current folder path:", current_folder)
Current folder path: C:\Users\user\Desktop\Spatial_Transcriptomics
In [28]:
# Merfish example in : https://scanpy.readthedocs.io/en/stable/tutorials/spatial/basic-analysis.html
In [ ]: